{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "09a35eb4-9a7a-480d-9522-69efe4ed3a71",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from pandas.api.types import is_extension_array_dtype, is_integer_dtype"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e3b033d-0427-4fdb-aea4-60cd9803e1ee",
   "metadata": {},
   "source": [
    "## OD Create for Hypo 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "37ede122-fb0a-45d6-a1ee-cf6dd3406d62",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2020_streamed.csv  shape=(1092, 12)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2021_streamed.csv  shape=(1092, 12)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2022_streamed.csv  shape=(1092, 12)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2023_streamed.csv  shape=(1091, 12)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2024_streamed.csv  shape=(1092, 12)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2025_streamed.csv  shape=(546, 12)\n",
      "[OK] concat 6 files -> shape=(6005, 12)\n",
      "[WARN] could not write DEST parquet: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.\n",
      "A suitable version of pyarrow or fastparquet is required for parquet support.\n",
      "Trying to import the above resulted in these errors:\n",
      " - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.\n",
      " - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.\n",
      "\n",
      "[HEAD DEST]\n",
      "     대상연월 도착시군구코드  HW_inbound_가중평균  HW_within_가중평균  WH_inbound_가중평균  \\\n",
      "0  202001   11010        23.608935       12.247058        25.434164   \n",
      "1  202001   11020        23.967862       12.183578        25.741310   \n",
      "2  202001   11030        24.258422       12.103089        25.844074   \n",
      "3  202001   11040        23.422781       12.850011        24.628922   \n",
      "4  202001   11050        21.743455       11.880319        22.646971   \n",
      "5  202001   11060        21.240356       12.599504        22.248430   \n",
      "6  202001   11070        20.020295       12.149402        21.505102   \n",
      "7  202001   11080        21.222967       13.320483        22.004891   \n",
      "8  202001   11090        19.015933       12.750122        20.355198   \n",
      "9  202001   11100        18.621790       12.027176        20.467877   \n",
      "\n",
      "   WH_within_가중평균  TOTAL_inbound_가중평균  TOTAL_within_가중평균  \\\n",
      "0       13.648064           49.043099          25.895122   \n",
      "1       13.617624           49.709173          25.801202   \n",
      "2       13.141463           50.102496          25.244553   \n",
      "3       13.939824           48.051703          26.789835   \n",
      "4       12.833745           44.390427          24.714064   \n",
      "5       13.725097           43.488786          26.324601   \n",
      "6       12.910752           41.525397          25.060155   \n",
      "7       14.805808           43.227858          28.126291   \n",
      "8       13.593387           39.371131          26.343510   \n",
      "9       12.814672           39.089667          24.841848   \n",
      "\n",
      "   GAP_inbound_vs_within_HW  GAP_inbound_vs_within_WH  \\\n",
      "0                 11.361877                 11.786100   \n",
      "1                 11.784285                 12.123686   \n",
      "2                 12.155333                 12.702611   \n",
      "3                 10.572770                 10.689099   \n",
      "4                  9.863136                  9.813226   \n",
      "5                  8.640852                  8.523333   \n",
      "6                  7.870893                  8.594350   \n",
      "7                  7.902484                  7.199083   \n",
      "8                  6.265811                  6.761810   \n",
      "9                  6.594614                  7.653205   \n",
      "\n",
      "   GAP_inbound_vs_within_TOTAL  __year_loaded      ym  \n",
      "0                    23.147977           2020  202001  \n",
      "1                    23.907971           2020  202001  \n",
      "2                    24.857943           2020  202001  \n",
      "3                    21.261868           2020  202001  \n",
      "4                    19.676362           2020  202001  \n",
      "5                    17.164186           2020  202001  \n",
      "6                    16.465243           2020  202001  \n",
      "7                    15.101567           2020  202001  \n",
      "8                    13.027621           2020  202001  \n",
      "9                    14.247819           2020  202001  \n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2020_streamed.csv  shape=(46755, 10)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2021_streamed.csv  shape=(46796, 10)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2022_streamed.csv  shape=(46808, 10)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2023_streamed.csv  shape=(46036, 10)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2024_streamed.csv  shape=(46762, 10)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2025_streamed.csv  shape=(23362, 10)\n",
      "[OK] concat 6 files -> shape=(256519, 10)\n",
      "[WARN] could not write OD parquet: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.\n",
      "A suitable version of pyarrow or fastparquet is required for parquet support.\n",
      "Trying to import the above resulted in these errors:\n",
      " - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.\n",
      " - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.\n",
      "\n",
      "[HEAD OD]\n",
      "     대상연월 출발시군구코드 도착시군구코드       HW_총이동인구  HW_가중평균시간       WH_총이동인구  WH_가중평균시간  \\\n",
      "0  202001   11010   11010  925261.324768  12.247058  883860.803711  13.648064   \n",
      "1  202001   11010   11020  402499.714867  15.361428  361907.262451  17.183521   \n",
      "2  202001   11010   11030   67756.599922  19.041720   58329.179932  20.534165   \n",
      "3  202001   11010   11040   69497.239990  19.169219   60467.360016  20.184423   \n",
      "4  202001   11010   11050   24062.300016  20.304302   22494.120025  19.941638   \n",
      "5  202001   11010   11060   92722.660871  16.642527   86407.800049  17.093644   \n",
      "6  202001   11010   11070   14411.240011  22.168169   13913.359860  21.529580   \n",
      "7  202001   11010   11080  146531.380722  15.644088  142425.019958  16.047206   \n",
      "8  202001   11010   11090   32659.980258  21.030129   32339.940010  20.064292   \n",
      "9  202001   11010   11100   17371.760015  21.729036   16979.780001  21.587335   \n",
      "\n",
      "   TOTAL_통근시간(분)  TOTAL_플로우_보조지표(최소값)  __year_loaded      ym  \n",
      "0      25.895122        883860.803711           2020  202001  \n",
      "1      32.544949        361907.262451           2020  202001  \n",
      "2      39.575885         58329.179932           2020  202001  \n",
      "3      39.353642         60467.360016           2020  202001  \n",
      "4      40.245940         22494.120025           2020  202001  \n",
      "5      33.736171         86407.800049           2020  202001  \n",
      "6      43.697749         13913.359860           2020  202001  \n",
      "7      31.691294        142425.019958           2020  202001  \n",
      "8      41.094421         32339.940010           2020  202001  \n",
      "9      43.316371         16979.780001           2020  202001  \n",
      "\n",
      "[OK] merged OD x DEST -> shape=(256519, 22)\n",
      "[WARN] unmatched origin codes: ['11010', '11020', '11030', '11040', '11050', '11060', '11070', '11080', '11090', '11100']\n",
      "[WARN] unmatched destination codes: ['11010', '11020', '11030', '11040', '11050', '11060', '11070', '11080', '11090', '11100']\n"
     ]
    }
   ],
   "source": [
    "# ============== Paths & file patterns ==============\n",
    "BASE = Path(r\"E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\")\n",
    "YEARS = range(2020, 2026)  # 2020–2025 inclusive\n",
    "\n",
    "DEST_FMT = \"DEST_월별_인바운드_위딘_갭_{year}_streamed.csv\"\n",
    "OD_FMT   = \"OD_월별_HW_WH_TOTAL_{year}_streamed.csv\"\n",
    "CODEBOOK = BASE / \"서울생활이동데이터_자치구코드.xlsx\"\n",
    "\n",
    "OUT_DEST = BASE / \"DEST_월별_인바운드_위딘_갭_2020_2025_concat.parquet\"\n",
    "OUT_OD   = BASE / \"OD_월별_HW_WH_TOTAL_2020_2025_concat.parquet\"\n",
    "OUT_MERGED_CSV = BASE / \"COMBINED_OD_DEST_2020_2025.csv\"\n",
    "OUT_MERGED_PQ  = BASE / \"COMBINED_OD_DEST_2020_2025.parquet\"\n",
    "\n",
    "# ============== Small utils ==============\n",
    "def read_csv_auto(fp, **kwargs):\n",
    "    \"\"\"UTF-8-SIG first, fallback to CP949. Keeps low_memory False.\"\"\"\n",
    "    kw = dict(low_memory=False); kw.update(kwargs)\n",
    "    try:\n",
    "        return pd.read_csv(fp, encoding=\"utf-8-sig\", **kw)\n",
    "    except UnicodeDecodeError:\n",
    "        return pd.read_csv(fp, encoding=\"cp949\", **kw)\n",
    "\n",
    "def norm_ym(s):\n",
    "    \"\"\"Make 'ym' a 6-digit YYYYMM string.\"\"\"\n",
    "    out = s.astype(str).str.replace(r\"\\D\", \"\", regex=True).str.slice(0, 6)\n",
    "    return out\n",
    "\n",
    "def pick(df, candidates, required=True):\n",
    "    for c in candidates:\n",
    "        if c in df.columns:\n",
    "            return c\n",
    "    if required:\n",
    "        raise KeyError(f\"None of {candidates} in columns: {list(df.columns)[:10]}...\")\n",
    "    return None\n",
    "\n",
    "def concat_years(fmt):\n",
    "    dfs = []\n",
    "    for y in YEARS:\n",
    "        fp = BASE / fmt.format(year=y)\n",
    "        if not fp.exists():\n",
    "            print(f\"[WARN] missing: {fp}\")\n",
    "            continue\n",
    "        df = read_csv_auto(fp)\n",
    "        df[\"__year_loaded\"] = y\n",
    "        dfs.append(df)\n",
    "        print(f\"[OK] read  {fp}  shape={df.shape}\")\n",
    "    if not dfs:\n",
    "        raise FileNotFoundError(f\"No files found for pattern {fmt}\")\n",
    "    out = pd.concat(dfs, ignore_index=True)\n",
    "    print(f\"[OK] concat {len(dfs)} files -> shape={out.shape}\")\n",
    "    return out\n",
    "\n",
    "# ============== 1) Read & standardize DEST ==============\n",
    "dest = concat_years(DEST_FMT)\n",
    "\n",
    "# Your screenshot shows columns:\n",
    "# 대상연월, district, HW_inbound_가중평균, HW_within_가중평균, WH_inbound_가중평균, WH_within_가중평균,\n",
    "# TOTAL_inbound_가중평균, TOTAL_within_가중평균, GAP_inbound_vs_within_HW, GAP_inbound_vs_within_WH, GAP_inbound_vs_within_TOTAL\n",
    "ym_col_dest = pick(dest, [\"ym\", \"대상연월\", \"연월\"])\n",
    "dest[\"ym\"] = norm_ym(dest[ym_col_dest])\n",
    "\n",
    "# rename 'district' -> '도착시군구코드' to line up with OD key\n",
    "dist_col = pick(dest, [\"district\"])\n",
    "dest = dest.rename(columns={dist_col: \"도착시군구코드\"})\n",
    "\n",
    "# ensure codes are strings\n",
    "dest[\"도착시군구코드\"] = dest[\"도착시군구코드\"].astype(str).str.strip()\n",
    "\n",
    "# optional: check uniqueness per (도착시군구코드, ym)\n",
    "dup_mask = dest.duplicated(subset=[\"도착시군구코드\", \"ym\"], keep=False)\n",
    "if dup_mask.any():\n",
    "    ndup = dup_mask.sum()\n",
    "    print(f\"[WARN] DEST has {ndup} duplicate rows on (도착시군구코드, ym). Keeping first within each key.\")\n",
    "    dest = dest.sort_values([\"도착시군구코드\", \"ym\"]).drop_duplicates([\"도착시군구코드\", \"ym\"], keep=\"first\")\n",
    "\n",
    "# Save the concatenated DEST (optional)\n",
    "try:\n",
    "    dest.to_parquet(OUT_DEST, index=False)\n",
    "    print(f\"[OK] wrote {OUT_DEST}\")\n",
    "except Exception as e:\n",
    "    print(f\"[WARN] could not write DEST parquet: {e}\")\n",
    "\n",
    "# quick peek\n",
    "print(\"\\n[HEAD DEST]\")\n",
    "print(dest.head(10))\n",
    "\n",
    "# ============== 2) Read & standardize OD ==============\n",
    "od = concat_years(OD_FMT)\n",
    "\n",
    "# Your screenshot shows columns:\n",
    "# 대상연월, 출발시군구코드, 도착시군구코드, HW_총이동인구, HW_가중평균시간, WH_총이동인구, WH_가중평균시간,\n",
    "# TOTAL_통근시간(분), TOTAL_플로우_보조지표(최소값)\n",
    "ym_col_od = pick(od, [\"ym\", \"대상연월\", \"연월\"])\n",
    "od[\"ym\"] = norm_ym(od[ym_col_od])\n",
    "\n",
    "o_code = pick(od, [\"출발시군구코드\", \"출발 시군구코드\"])\n",
    "d_code = pick(od, [\"도착시군구코드\", \"도착 시군구코드\"])\n",
    "if o_code != \"출발시군구코드\":\n",
    "    od = od.rename(columns={o_code: \"출발시군구코드\"})\n",
    "if d_code != \"도착시군구코드\":\n",
    "    od = od.rename(columns={d_code: \"도착시군구코드\"})\n",
    "\n",
    "od[\"출발시군구코드\"] = od[\"출발시군구코드\"].astype(str).str.strip()\n",
    "od[\"도착시군구코드\"] = od[\"도착시군구코드\"].astype(str).str.strip()\n",
    "\n",
    "# Save concatenated OD (optional)\n",
    "try:\n",
    "    od.to_parquet(OUT_OD, index=False)\n",
    "    print(f\"[OK] wrote {OUT_OD}\")\n",
    "except Exception as e:\n",
    "    print(f\"[WARN] could not write OD parquet: {e}\")\n",
    "\n",
    "# quick peek\n",
    "print(\"\\n[HEAD OD]\")\n",
    "print(od.head(10))\n",
    "\n",
    "# ============== 3) Merge DEST → OD by (도착시군구코드, ym) ==============\n",
    "merged = od.merge(dest, how=\"left\", on=[\"도착시군구코드\", \"ym\"])\n",
    "print(f\"\\n[OK] merged OD x DEST -> shape={merged.shape}\")\n",
    "\n",
    "# ============== 4) Map district codes → names (출발 + 도착) ==============\n",
    "code_df = pd.read_excel(CODEBOOK)\n",
    "\n",
    "# be tolerant about headers (strip spaces)\n",
    "code_df.columns = [c.strip() for c in code_df.columns]\n",
    "\n",
    "def pick(df, candidates, required=True):\n",
    "    cols = {c.lower(): c for c in df.columns}\n",
    "    for cand in candidates:\n",
    "        if cand.lower() in cols:\n",
    "            return cols[cand.lower()]\n",
    "    if required:\n",
    "        raise KeyError(f\"None of {candidates} found. Available: {list(df.columns)}\")\n",
    "    return None\n",
    "\n",
    "# your file shows: ['출발시군구코드','시군구','full name']\n",
    "code_col = pick(code_df, [\n",
    "    \"출발시군구코드\",        # present in your xlsx\n",
    "    \"도착시군구코드\",\n",
    "    \"시군구코드\", \"자치구코드\", \"코드\", \"code\"\n",
    "])\n",
    "name_col = pick(code_df, [\n",
    "    \"full name\",            # prefer this\n",
    "    \"FULL NAME\", \"FULL_NAME\",\n",
    "    \"시군구\",               # fallback if needed\n",
    "    \"자치구명\", \"시군구명\", \"구명\", \"name\"\n",
    "])\n",
    "\n",
    "code_df[code_col] = code_df[code_col].astype(str).str.strip()\n",
    "code_df[name_col] = code_df[name_col].astype(str).str.strip()\n",
    "\n",
    "# build mapping dict\n",
    "code_to_name = dict(zip(code_df[code_col], code_df[name_col]))\n",
    "\n",
    "# map to both origin and destination codes\n",
    "merged[\"출발시군구명\"] = merged[\"출발시군구코드\"].astype(str).str.strip().map(code_to_name)\n",
    "merged[\"도착시군구명\"] = merged[\"도착시군구코드\"].astype(str).str.strip().map(code_to_name)\n",
    "\n",
    "# quick diagnostics\n",
    "missing_o = sorted(set(merged.loc[merged[\"출발시군구명\"].isna(), \"출발시군구코드\"]))\n",
    "missing_d = sorted(set(merged.loc[merged[\"도착시군구명\"].isna(), \"도착시군구코드\"]))\n",
    "if missing_o or missing_d:\n",
    "    print(\"[WARN] unmatched origin codes:\", missing_o[:10])\n",
    "    print(\"[WARN] unmatched destination codes:\", missing_d[:10])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c2262ccb-6904-4754-8bbc-d5a87fc22216",
   "metadata": {},
   "source": [
    "# DEST Create for Hypo 2 and 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c5acd34d-495a-475a-bd31-5338307ffff0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2020_streamed.csv  shape=(1092, 12)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2021_streamed.csv  shape=(1092, 12)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2022_streamed.csv  shape=(1092, 12)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2023_streamed.csv  shape=(1091, 12)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2024_streamed.csv  shape=(1092, 12)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2025_streamed.csv  shape=(546, 12)\n",
      "[OK] concatenated 6 files -> shape=(6005, 12)\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\DEST_월별_인바운드_위딘_갭_2020_2025_concat.parquet\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2020_streamed.csv  shape=(46755, 10)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2021_streamed.csv  shape=(46796, 10)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2022_streamed.csv  shape=(46808, 10)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2023_streamed.csv  shape=(46036, 10)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2024_streamed.csv  shape=(46762, 10)\n",
      "[OK] read  E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2025_streamed.csv  shape=(23362, 10)\n",
      "[OK] concatenated 6 files -> shape=(256519, 10)\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\OD_월별_HW_WH_TOTAL_2020_2025_concat.parquet\n",
      "[OK] merged OD × DEST -> shape=(256519, 22)\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\COMBINED_OD_DEST_2020_2025.csv\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\\COMBINED_OD_DEST_2020_2025.parquet\n",
      "\n",
      "[HEAD MERGED]\n",
      "        ym 출발시군구코드     출발시군구명 도착시군구코드      도착시군구명  대상연월_x       HW_총이동인구  \\\n",
      "0   202001   11010  서울특별시 종로구   11010   서울특별시 종로구  202001  925261.324768   \n",
      "1   202001   11010  서울특별시 종로구   11020    서울특별시 중구  202001  402499.714867   \n",
      "2   202001   11010  서울특별시 종로구   11030   서울특별시 용산구  202001   67756.599922   \n",
      "3   202001   11010  서울특별시 종로구   11040   서울특별시 성동구  202001   69497.239990   \n",
      "4   202001   11010  서울특별시 종로구   11050   서울특별시 광진구  202001   24062.300016   \n",
      "5   202001   11010  서울특별시 종로구   11060  서울특별시 동대문구  202001   92722.660871   \n",
      "6   202001   11010  서울특별시 종로구   11070   서울특별시 중랑구  202001   14411.240011   \n",
      "7   202001   11010  서울특별시 종로구   11080   서울특별시 성북구  202001  146531.380722   \n",
      "8   202001   11010  서울특별시 종로구   11090   서울특별시 강북구  202001   32659.980258   \n",
      "9   202001   11010  서울특별시 종로구   11100   서울특별시 도봉구  202001   17371.760015   \n",
      "10  202001   11010  서울특별시 종로구   11110   서울특별시 노원구  202001   22607.779891   \n",
      "11  202001   11010  서울특별시 종로구   11120   서울특별시 은평구  202001   65265.859634   \n",
      "\n",
      "    HW_가중평균시간       WH_총이동인구  WH_가중평균시간  ...  HW_inbound_가중평균  HW_within_가중평균  \\\n",
      "0   12.247058  883860.803711  13.648064  ...        23.608935       12.247058   \n",
      "1   15.361428  361907.262451  17.183521  ...        23.967862       12.183578   \n",
      "2   19.041720   58329.179932  20.534165  ...        24.258422       12.103089   \n",
      "3   19.169219   60467.360016  20.184423  ...        23.422781       12.850011   \n",
      "4   20.304302   22494.120025  19.941638  ...        21.743455       11.880319   \n",
      "5   16.642527   86407.800049  17.093644  ...        21.240356       12.599504   \n",
      "6   22.168169   13913.359860  21.529580  ...        20.020295       12.149402   \n",
      "7   15.644088  142425.019958  16.047206  ...        21.222967       13.320483   \n",
      "8   21.030129   32339.940010  20.064292  ...        19.015933       12.750122   \n",
      "9   21.729036   16979.780001  21.587335  ...        18.621790       12.027176   \n",
      "10  22.419592   21033.600037  22.978853  ...        19.704913       12.609325   \n",
      "11  17.258346   63039.859528  17.446517  ...        21.131889       12.839111   \n",
      "\n",
      "    WH_inbound_가중평균  WH_within_가중평균  TOTAL_inbound_가중평균  TOTAL_within_가중평균  \\\n",
      "0         25.434164       13.648064           49.043099          25.895122   \n",
      "1         25.741310       13.617624           49.709173          25.801202   \n",
      "2         25.844074       13.141463           50.102496          25.244553   \n",
      "3         24.628922       13.939824           48.051703          26.789835   \n",
      "4         22.646971       12.833745           44.390427          24.714064   \n",
      "5         22.248430       13.725097           43.488786          26.324601   \n",
      "6         21.505102       12.910752           41.525397          25.060155   \n",
      "7         22.004891       14.805808           43.227858          28.126291   \n",
      "8         20.355198       13.593387           39.371131          26.343510   \n",
      "9         20.467877       12.814672           39.089667          24.841848   \n",
      "10        21.157730       14.209450           40.862643          26.818775   \n",
      "11        22.303197       14.243727           43.435087          27.082838   \n",
      "\n",
      "    GAP_inbound_vs_within_HW  GAP_inbound_vs_within_WH  \\\n",
      "0                  11.361877                 11.786100   \n",
      "1                  11.784285                 12.123686   \n",
      "2                  12.155333                 12.702611   \n",
      "3                  10.572770                 10.689099   \n",
      "4                   9.863136                  9.813226   \n",
      "5                   8.640852                  8.523333   \n",
      "6                   7.870893                  8.594350   \n",
      "7                   7.902484                  7.199083   \n",
      "8                   6.265811                  6.761810   \n",
      "9                   6.594614                  7.653205   \n",
      "10                  7.095588                  6.948280   \n",
      "11                  8.292779                  8.059470   \n",
      "\n",
      "    GAP_inbound_vs_within_TOTAL  __year_loaded_y  \n",
      "0                     23.147977             2020  \n",
      "1                     23.907971             2020  \n",
      "2                     24.857943             2020  \n",
      "3                     21.261868             2020  \n",
      "4                     19.676362             2020  \n",
      "5                     17.164186             2020  \n",
      "6                     16.465243             2020  \n",
      "7                     15.101567             2020  \n",
      "8                     13.027621             2020  \n",
      "9                     14.247819             2020  \n",
      "10                    14.043868             2020  \n",
      "11                    16.352249             2020  \n",
      "\n",
      "[12 rows x 24 columns]\n"
     ]
    }
   ],
   "source": [
    "# =========================\n",
    "# 0) Paths & parameters\n",
    "# =========================\n",
    "BASE = Path(r\"E:\\불평등 연구\\데이터\\59_출퇴근불평등\\이동데이터\")\n",
    "YEARS = range(2020, 2026)  # 2020–2025 inclusive\n",
    "\n",
    "DEST_FMT = \"DEST_월별_인바운드_위딘_갭_{year}_streamed.csv\"\n",
    "OD_FMT   = \"OD_월별_HW_WH_TOTAL_{year}_streamed.csv\"\n",
    "CODEBOOK = BASE / \"서울생활이동데이터_자치구코드.xlsx\"\n",
    "\n",
    "OUT_DEST_PARQ = BASE / \"DEST_월별_인바운드_위딘_갭_2020_2025_concat.parquet\"\n",
    "OUT_OD_PARQ   = BASE / \"OD_월별_HW_WH_TOTAL_2020_2025_concat.parquet\"\n",
    "OUT_COMBINED_CSV = BASE / \"COMBINED_OD_DEST_2020_2025.csv\"\n",
    "OUT_COMBINED_PARQ = BASE / \"COMBINED_OD_DEST_2020_2025.parquet\"\n",
    "\n",
    "# =========================\n",
    "# 1) Helpers\n",
    "# =========================\n",
    "def read_csv_auto(fp, **kwargs):\n",
    "    \"\"\"Try UTF-8-SIG then CP949. Keeps low_memory=False by default.\"\"\"\n",
    "    kw = dict(low_memory=False); kw.update(kwargs)\n",
    "    try:\n",
    "        return pd.read_csv(fp, encoding=\"utf-8-sig\", **kw)\n",
    "    except UnicodeDecodeError:\n",
    "        return pd.read_csv(fp, encoding=\"cp949\", **kw)\n",
    "\n",
    "def norm_ym(s):\n",
    "    \"\"\"Normalize to 6-digit YYYYMM (strip non-digits).\"\"\"\n",
    "    return s.astype(str).str.replace(r\"\\D\", \"\", regex=True).str.slice(0, 6)\n",
    "\n",
    "def pick(df, candidates, required=True):\n",
    "    \"\"\"Pick the first column that exists (case-insensitive, trimmed).\"\"\"\n",
    "    cols_map = {c.strip().lower(): c for c in df.columns}\n",
    "    for cand in candidates:\n",
    "        key = cand.strip().lower()\n",
    "        if key in cols_map:\n",
    "            return cols_map[key]\n",
    "    if required:\n",
    "        raise KeyError(f\"None of {candidates} found. Available: {list(df.columns)}\")\n",
    "    return None\n",
    "\n",
    "def concat_years(fmt):\n",
    "    dfs = []\n",
    "    for y in YEARS:\n",
    "        fp = BASE / fmt.format(year=y)\n",
    "        if not fp.exists():\n",
    "            print(f\"[WARN] missing file: {fp}\")\n",
    "            continue\n",
    "        df = read_csv_auto(fp)\n",
    "        df[\"__year_loaded\"] = y\n",
    "        dfs.append(df)\n",
    "        print(f\"[OK] read  {fp}  shape={df.shape}\")\n",
    "    if not dfs:\n",
    "        raise FileNotFoundError(f\"No files found for pattern {fmt}\")\n",
    "    out = pd.concat(dfs, ignore_index=True)\n",
    "    print(f\"[OK] concatenated {len(dfs)} files -> shape={out.shape}\")\n",
    "    return out\n",
    "\n",
    "# =========================\n",
    "# 2) Load & standardize DEST\n",
    "# =========================\n",
    "dest = concat_years(DEST_FMT)\n",
    "\n",
    "# Example columns (from your screenshot):\n",
    "# 대상연월, district, HW_inbound_가중평균, HW_within_가중평균, WH_inbound_가중평균, WH_within_가중평균,\n",
    "# TOTAL_inbound_가중평균, TOTAL_within_가중평균, GAP_inbound_vs_within_HW, GAP_inbound_vs_within_WH, GAP_inbound_vs_within_TOTAL\n",
    "ym_col_dest = pick(dest, [\"ym\", \"대상연월\", \"연월\"])\n",
    "dest[\"ym\"] = norm_ym(dest[ym_col_dest])\n",
    "\n",
    "# Rename 'district' → '도착시군구코드' to match OD key\n",
    "dist_col = pick(dest, [\"district\"])\n",
    "dest = dest.rename(columns={dist_col: \"도착시군구코드\"})\n",
    "dest[\"도착시군구코드\"] = dest[\"도착시군구코드\"].astype(str).str.strip()\n",
    "\n",
    "# Ensure unique (도착시군구코드, ym)\n",
    "dup = dest.duplicated(subset=[\"도착시군구코드\", \"ym\"], keep=False)\n",
    "if dup.any():\n",
    "    print(f\"[WARN] DEST has {dup.sum()} duplicate key rows; keeping first occurrence per key.\")\n",
    "    dest = (dest.sort_values([\"도착시군구코드\", \"ym\"])\n",
    "                 .drop_duplicates([\"도착시군구코드\", \"ym\"], keep=\"first\"))\n",
    "\n",
    "# Save concatenated DEST (optional)\n",
    "try:\n",
    "    dest.to_parquet(OUT_DEST_PARQ, index=False)\n",
    "    print(f\"[OK] wrote {OUT_DEST_PARQ}\")\n",
    "except Exception as e:\n",
    "    print(f\"[WARN] could not write DEST parquet: {e}\")\n",
    "\n",
    "# =========================\n",
    "# 3) Load & standardize OD\n",
    "# =========================\n",
    "od = concat_years(OD_FMT)\n",
    "\n",
    "# Example columns (from your screenshot):\n",
    "# 대상연월, 출발시군구코드, 도착시군구코드, HW_총이동인구, HW_가중평균시간, WH_총이동인구, WH_가중평균시간,\n",
    "# TOTAL_통근시간(분), TOTAL_플로우_보조지표(최소값)\n",
    "ym_col_od = pick(od, [\"ym\", \"대상연월\", \"연월\"])\n",
    "od[\"ym\"] = norm_ym(od[ym_col_od])\n",
    "\n",
    "# Ensure canonical column names\n",
    "o_col = pick(od, [\"출발시군구코드\", \"출발 시군구코드\"])\n",
    "d_col = pick(od, [\"도착시군구코드\", \"도착 시군구코드\"])\n",
    "if o_col != \"출발시군구코드\":\n",
    "    od = od.rename(columns={o_col: \"출발시군구코드\"})\n",
    "if d_col != \"도착시군구코드\":\n",
    "    od = od.rename(columns={d_col: \"도착시군구코드\"})\n",
    "\n",
    "od[\"출발시군구코드\"] = od[\"출발시군구코드\"].astype(str).str.strip()\n",
    "od[\"도착시군구코드\"] = od[\"도착시군구코드\"].astype(str).str.strip()\n",
    "\n",
    "# Save concatenated OD (optional)\n",
    "try:\n",
    "    od.to_parquet(OUT_OD_PARQ, index=False)\n",
    "    print(f\"[OK] wrote {OUT_OD_PARQ}\")\n",
    "except Exception as e:\n",
    "    print(f\"[WARN] could not write OD parquet: {e}\")\n",
    "\n",
    "# =========================\n",
    "# 4) Merge DEST → OD by (도착시군구코드, ym)\n",
    "# =========================\n",
    "merged = od.merge(dest, how=\"left\", on=[\"도착시군구코드\", \"ym\"])\n",
    "print(f\"[OK] merged OD × DEST -> shape={merged.shape}\")\n",
    "\n",
    "# =========================\n",
    "# 5) Map district codes → names (출발 + 도착) using codebook: ['시군구','full name']\n",
    "# =========================\n",
    "code_df = pd.read_excel(CODEBOOK)\n",
    "code_df.columns = [c.strip() for c in code_df.columns]\n",
    "\n",
    "# Exactly your headers:\n",
    "# '시군구' = numeric code; 'full name' = Korean district name (e.g., '서울특별시 종로구')\n",
    "code_df[\"시군구\"] = code_df[\"시군구\"].astype(str).str.strip()\n",
    "code_df[\"full name\"] = code_df[\"full name\"].astype(str).str.strip()\n",
    "\n",
    "# In case of duplicates in the codebook, keep first\n",
    "code_df = code_df.drop_duplicates(subset=[\"시군구\"], keep=\"first\")\n",
    "\n",
    "code_to_name = dict(zip(code_df[\"시군구\"], code_df[\"full name\"]))\n",
    "\n",
    "merged[\"출발시군구명\"] = merged[\"출발시군구코드\"].map(code_to_name)\n",
    "merged[\"도착시군구명\"] = merged[\"도착시군구코드\"].map(code_to_name)\n",
    "\n",
    "# Diagnostics for unmatched codes (e.g., non-Seoul origins)\n",
    "miss_o = sorted(set(merged.loc[merged[\"출발시군구명\"].isna(), \"출발시군구코드\"]))\n",
    "miss_d = sorted(set(merged.loc[merged[\"도착시군구명\"].isna(), \"도착시군구코드\"]))\n",
    "if miss_o or miss_d:\n",
    "    print(f\"[WARN] unmatched origin codes (first 10): {miss_o[:10]}\")\n",
    "    print(f\"[WARN] unmatched destination codes (first 10): {miss_d[:10]}\")\n",
    "\n",
    "# Optionally add shorter names without the '서울특별시 ' prefix\n",
    "# merged[\"출발시군구명_짧게\"] = merged[\"출발시군구명\"].str.replace(r\"^서울특별시\\s*\", \"\", regex=True)\n",
    "# merged[\"도착시군구명_짧게\"] = merged[\"도착시군구명\"].str.replace(r\"^서울특별시\\s*\", \"\", regex=True)\n",
    "\n",
    "# =========================\n",
    "# 6) Order columns & save final\n",
    "# =========================\n",
    "front = [\"ym\", \"출발시군구코드\", \"출발시군구명\", \"도착시군구코드\", \"도착시군구명\"]\n",
    "other = [c for c in merged.columns if c not in front]\n",
    "merged = merged[front + other]\n",
    "\n",
    "merged.to_csv(OUT_COMBINED_CSV, index=False, encoding=\"utf-8-sig\")\n",
    "print(f\"[OK] wrote {OUT_COMBINED_CSV}\")\n",
    "\n",
    "try:\n",
    "    merged.to_parquet(OUT_COMBINED_PARQ, index=False)\n",
    "    print(f\"[OK] wrote {OUT_COMBINED_PARQ}\")\n",
    "except Exception as e:\n",
    "    print(f\"[WARN] could not write merged parquet: {e}\")\n",
    "\n",
    "# (Optional) quick peek\n",
    "print(\"\\n[HEAD MERGED]\")\n",
    "print(merged.head(12))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05d6efcb-0269-4329-978d-a4456d8db10e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "5e120ab8-d66d-4569-ad8b-0c1f28203cd3",
   "metadata": {},
   "source": [
    "## House price Merge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a76dcd55-05c4-472c-bdaf-697e545b005c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INFO] Mapped APT names to codes via codebook. Unmatched rows: 924/5808\n",
      "[CHECK] df_2 sample keys: [{'ym': '202001', '출발시군구코드': '11010', '도착시군구코드': '11010'}, {'ym': '202001', '출발시군구코드': '11010', '도착시군구코드': '11020'}, {'ym': '202001', '출발시군구코드': '11010', '도착시군구코드': '11030'}]\n",
      "[CHECK] APT sample: [{'ym': '202001', '시군구': '11010', 'APT_price': 1022319.0, 'ln_APT_price': 13.8375841341148}, {'ym': '202001', '시군구': '11020', 'APT_price': 795743.0, 'ln_APT_price': 13.587031548373005}, {'ym': '202001', '시군구': '23010', 'APT_price': 795743.0, 'ln_APT_price': 13.587031548373005}]\n",
      "[AUDIT] share with dest price: 0.883\n",
      "[AUDIT] share with orig price: 0.885\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\ROUTE_PANEL_H1_2020_2025.csv\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\DEST_PANEL_H2H3_2020_2025.csv\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\ROUTE_PANEL_H1_2020_2025.parquet (engine=fastparquet)\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\DEST_PANEL_H2H3_2020_2025.parquet (engine=fastparquet)\n",
      "[OK] wrote E:\\불평등 연구\\데이터\\59_출퇴근불평등\\COMMUTE_PRICE_PANELS.xlsx\n",
      "\n",
      "[HEAD ROUTE]\n",
      "       ym 출발시군구코드  출발시군구명 도착시군구코드   도착시군구명  APT_price_o  APT_price_d  \\\n",
      "0  202001   11010  서울 종로구   11010   서울 종로구    1022319.0    1022319.0   \n",
      "1  202001   11010  서울 종로구   11020    서울 중구    1022319.0     795743.0   \n",
      "2  202001   11010  서울 종로구   11020    서울 중구    1022319.0     255071.0   \n",
      "3  202001   11010  서울 종로구   11030   서울 용산구    1022319.0    1389939.0   \n",
      "4  202001   11010  서울 종로구   11040   서울 성동구    1022319.0     899118.0   \n",
      "5  202001   11010  서울 종로구   11050   서울 광진구    1022319.0    1014402.0   \n",
      "6  202001   11010  서울 종로구   11060  서울 동대문구    1022319.0     652774.0   \n",
      "7  202001   11010  서울 종로구   11070   서울 중랑구    1022319.0     451372.0   \n",
      "\n",
      "   ln_APT_price_o  ln_APT_price_d  Delta_log_price  Rel_level_price  \\\n",
      "0       13.837584       13.837584         0.000000         0.000000   \n",
      "1       13.837584       13.587032        -0.250553        -0.221629   \n",
      "2       13.837584       12.449297        -1.388287        -0.750498   \n",
      "3       13.837584       14.144770         0.307186         0.359594   \n",
      "4       13.837584       13.709170        -0.128415        -0.120511   \n",
      "5       13.837584       13.829810        -0.007774        -0.007744   \n",
      "6       13.837584       13.388986        -0.448598        -0.361477   \n",
      "7       13.837584       13.020047        -0.817537        -0.558482   \n",
      "\n",
      "        HW_총이동인구       WH_총이동인구  HW_가중평균시간  WH_가중평균시간  TOTAL_통근시간(분)  \\\n",
      "0  925261.324768  883860.803711  12.247058  13.648064      25.895122   \n",
      "1  402499.714867  361907.262451  15.361428  17.183521      32.544949   \n",
      "2  402499.714867  361907.262451  15.361428  17.183521      32.544949   \n",
      "3   67756.599922   58329.179932  19.041720  20.534165      39.575885   \n",
      "4   69497.239990   60467.360016  19.169219  20.184423      39.353642   \n",
      "5   24062.300016   22494.120025  20.304302  19.941638      40.245940   \n",
      "6   92722.660871   86407.800049  16.642527  17.093644      33.736171   \n",
      "7   14411.240011   13913.359860  22.168169  21.529580      43.697749   \n",
      "\n",
      "   TOTAL_플로우_보조지표(최소값)  \n",
      "0        883860.803711  \n",
      "1        361907.262451  \n",
      "2        361907.262451  \n",
      "3         58329.179932  \n",
      "4         60467.360016  \n",
      "5         22494.120025  \n",
      "6         86407.800049  \n",
      "7         13913.359860  \n",
      "\n",
      "[HEAD DEST]\n",
      "       ym 도착시군구코드  ln_APT_price_d  ln_origin_in  GapPrice  HW_flow_total  \\\n",
      "0  202001   11010       13.837584     13.302565  0.535019   1.071710e+07   \n",
      "1  202001   11020       13.587032     13.362138  0.224894   2.876433e+07   \n",
      "2  202001   11030       14.144770     13.315724  0.829047   5.707278e+06   \n",
      "3  202001   11040       13.709170     13.365750  0.343420   6.526757e+06   \n",
      "4  202001   11050       13.829810     13.361259  0.468550   4.489253e+06   \n",
      "5  202001   11060       13.388986     13.257429  0.131557   5.580510e+06   \n",
      "6  202001   11070       13.020047     13.197086 -0.177039   3.681377e+06   \n",
      "7  202001   11080       13.287061     13.225736  0.061324   4.934725e+06   \n",
      "\n",
      "   HW_flow_inbound  InboundShare_HW  HW_inbound_가중평균  HW_within_가중평균  \\\n",
      "0     9.791843e+06         0.913665        23.608935       12.247058   \n",
      "1     2.521462e+07         0.876593        23.967862       12.183578   \n",
      "2     4.571811e+06         0.801049        24.258422       12.103089   \n",
      "3     4.984470e+06         0.763698        23.422781       12.850011   \n",
      "4     2.844435e+06         0.633610        21.743455       11.880319   \n",
      "5     3.861637e+06         0.691986        21.240356       12.599504   \n",
      "6     1.997667e+06         0.542641        20.020295       12.149402   \n",
      "7     2.995346e+06         0.606993        21.222967       13.320483   \n",
      "\n",
      "   WH_inbound_가중평균  WH_within_가중평균  TOTAL_inbound_가중평균  TOTAL_within_가중평균  \\\n",
      "0        25.434164       13.648064           49.043099          25.895122   \n",
      "1        25.741310       13.617624           49.709173          25.801202   \n",
      "2        25.844074       13.141463           50.102496          25.244553   \n",
      "3        24.628922       13.939824           48.051703          26.789835   \n",
      "4        22.646971       12.833745           44.390427          24.714064   \n",
      "5        22.248430       13.725097           43.488786          26.324601   \n",
      "6        21.505102       12.910752           41.525397          25.060155   \n",
      "7        22.004891       14.805808           43.227858          28.126291   \n",
      "\n",
      "   GAP_inbound_vs_within_HW  GAP_inbound_vs_within_WH  \\\n",
      "0                 11.361877                 11.786100   \n",
      "1                 11.784285                 12.123686   \n",
      "2                 12.155333                 12.702611   \n",
      "3                 10.572770                 10.689099   \n",
      "4                  9.863136                  9.813226   \n",
      "5                  8.640852                  8.523333   \n",
      "6                  7.870893                  8.594350   \n",
      "7                  7.902484                  7.199083   \n",
      "\n",
      "   GAP_inbound_vs_within_TOTAL  \n",
      "0                    23.147977  \n",
      "1                    23.907971  \n",
      "2                    24.857943  \n",
      "3                    21.261868  \n",
      "4                    19.676362  \n",
      "5                    17.164186  \n",
      "6                    16.465243  \n",
      "7                    15.101567  \n"
     ]
    }
   ],
   "source": [
    "# =========================\n",
    "# Paths (edit if needed)\n",
    "# =========================\n",
    "BASE = Path(r\"E:\\불평등 연구\\데이터\\59_출퇴근불평등\")\n",
    "DF2_XLSX = BASE / \"COMBINED_OD_DEST_2020_2025.xlsx\"                          # merged OD×DEST panel (Excel)\n",
    "APT_XLSX = BASE / \"df_APT.xlsx\"                        # district–month APT prices (district = names)\n",
    "CODEBOOK_1 = BASE / \"서울생활이동데이터_자치구코드.xlsx\"      # preferred location (has '시군구', 'full name')\n",
    "CODEBOOK_2 = BASE / \"이동데이터\" / \"서울생활이동데이터_자치구코드.xlsx\"  # fallback\n",
    "\n",
    "# Output files\n",
    "OUT_ROUTE_PARQ = BASE / \"ROUTE_PANEL_H1_2020_2025.parquet\"\n",
    "OUT_ROUTE_CSV  = BASE / \"ROUTE_PANEL_H1_2020_2025.csv\"\n",
    "OUT_DEST_PARQ  = BASE / \"DEST_PANEL_H2H3_2020_2025.parquet\"\n",
    "OUT_DEST_CSV   = BASE / \"DEST_PANEL_H2H3_2020_2025.csv\"\n",
    "OUT_XLSX       = BASE / \"COMMUTE_PRICE_PANELS.xlsx\"  # optional (2 sheets)\n",
    "\n",
    "# =========================\n",
    "# Helpers\n",
    "# =========================\n",
    "def norm_ym(s: pd.Series) -> pd.Series:\n",
    "    \"\"\"Normalize to 6-digit YYYYMM string (strip non-digits).\"\"\"\n",
    "    return s.astype(str).str.replace(r\"\\D\", \"\", regex=True).str.slice(0, 6)\n",
    "\n",
    "def _pick(df, candidates, required=True):\n",
    "    for c in candidates:\n",
    "        if c in df.columns:\n",
    "            return c\n",
    "    if required:\n",
    "        raise KeyError(f\"None of {candidates} found in columns: {list(df.columns)[:10]}...\")\n",
    "    return None\n",
    "\n",
    "def _norm_name_kor(x: str) -> str:\n",
    "    \"\"\"\n",
    "    Normalize Korean district names to improve matching.\n",
    "    Keeps the last token (e.g., '서울특별시 종로구' -> '종로구', '서울 종로구' -> '종로구').\n",
    "    \"\"\"\n",
    "    s = str(x).strip()\n",
    "    s = re.sub(r\"[\\(\\)〈〉\\[\\]{}]\", \" \", s)\n",
    "    toks = re.split(r\"\\s+\", s)\n",
    "    return \"\" if not toks else toks[-1]\n",
    "\n",
    "def load_codebook() -> pd.DataFrame | None:\n",
    "    \"\"\"Load codebook and prepare mapping: full name → code (both as strings).\"\"\"\n",
    "    path = CODEBOOK_1 if CODEBOOK_1.exists() else (CODEBOOK_2 if CODEBOOK_2.exists() else None)\n",
    "    if path is None:\n",
    "        print(\"[WARN] Codebook not found. Will assume APT 'district' are numeric codes already.\")\n",
    "        return None\n",
    "    cb = pd.read_excel(path, sheet_name=0)\n",
    "    cb.columns = [str(c).strip() for c in cb.columns]\n",
    "    code_col = _pick(cb, [\"시군구\", \"시군구코드\", \"자치구코드\", \"code\"])\n",
    "    name_col = _pick(cb, [\"full name\", \"자치구명\", \"시군구명\", \"구명\", \"name\"])\n",
    "    cb = cb[[code_col, name_col]].rename(columns={code_col: \"code\", name_col: \"full_name\"})\n",
    "    cb[\"code\"] = cb[\"code\"].astype(str).str.strip()\n",
    "    cb[\"name_norm\"] = cb[\"full_name\"].map(_norm_name_kor)\n",
    "    return cb\n",
    "\n",
    "def read_df2_from_excel() -> pd.DataFrame:\n",
    "    \"\"\"Read df_2.xlsx and normalize keys/types.\"\"\"\n",
    "    df2 = pd.read_excel(DF2_XLSX, sheet_name=0)\n",
    "    df2.columns = [str(c).strip() for c in df2.columns]\n",
    "    if \"ym\" not in df2.columns:\n",
    "        raise KeyError(\"df_2.xlsx must contain a 'ym' column.\")\n",
    "    df2[\"ym\"] = norm_ym(df2[\"ym\"])\n",
    "    for col in [\"출발시군구코드\", \"도착시군구코드\"]:\n",
    "        if col not in df2.columns:\n",
    "            cand = [c for c in df2.columns if col.replace(\"코드\", \"\") in c and \"코드\" in c]\n",
    "            if not cand:\n",
    "                raise KeyError(f\"Missing '{col}' in df_2.xlsx.\")\n",
    "            df2 = df2.rename(columns={cand[0]: col})\n",
    "        df2[col] = df2[col].astype(str).str.strip()\n",
    "    return df2\n",
    "\n",
    "def read_apt_prices_named_districts(cb: pd.DataFrame | None) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Read df_APT.xlsx where 'district' are NAMES. Accepts long (ym,district,price) or wide (district + months).\n",
    "    Returns long with keys ['ym','시군구'] where '시군구' are NUMERIC CODES (as strings).\n",
    "    \"\"\"\n",
    "    apt = pd.read_excel(APT_XLSX, sheet_name=0)\n",
    "    apt.columns = [str(c).strip() for c in apt.columns]\n",
    "    if \"district\" not in apt.columns:\n",
    "        raise KeyError(\"df_APT.xlsx must have a 'district' column with district NAMES.\")\n",
    "\n",
    "    ym_col = \"ym\" if \"ym\" in apt.columns else (\"대상연월\" if \"대상연월\" in apt.columns else None)\n",
    "    price_candidates = [\"APT_price\", \"price\", \"가격\", \"매매가격\", \"매매가\", \"평균가격\", \"avg_price\"]\n",
    "    price_col = next((c for c in price_candidates if c in apt.columns), None)\n",
    "\n",
    "    # Long vs. wide\n",
    "    if ym_col and price_col:\n",
    "        apt_long = apt[[ym_col, \"district\", price_col]].rename(\n",
    "            columns={ym_col: \"ym\", price_col: \"APT_price\"}\n",
    "        )\n",
    "    else:\n",
    "        renamer, month_cols = {}, []\n",
    "        for c in apt.columns:\n",
    "            if c == \"district\": \n",
    "                continue\n",
    "            digits = re.sub(r\"\\D\", \"\", str(c))\n",
    "            if len(digits) >= 6:\n",
    "                renamer[c] = digits[:6]; month_cols.append(c)\n",
    "        if not month_cols:\n",
    "            raise KeyError(\"APT file looks wide but month-like columns (YYYYMM) were not found.\")\n",
    "        apt_wide = apt.rename(columns=renamer)\n",
    "        apt_long = apt_wide.melt(id_vars=[\"district\"], value_vars=[renamer[c] for c in month_cols],\n",
    "                                 var_name=\"ym\", value_name=\"APT_price\")\n",
    "\n",
    "    apt_long[\"ym\"] = norm_ym(apt_long[\"ym\"])\n",
    "    apt_long[\"APT_price\"] = (apt_long[\"APT_price\"].astype(str)\n",
    "                             .str.replace(\",\", \"\", regex=False)\n",
    "                             .replace(\"\", np.nan).astype(float))\n",
    "    # Map names → codes\n",
    "    looks_numeric = apt_long[\"district\"].astype(str).str.fullmatch(r\"\\d+\").fillna(False).mean() > 0.8\n",
    "    if looks_numeric:\n",
    "        apt_long[\"시군구\"] = apt_long[\"district\"].astype(str).str.strip()\n",
    "    else:\n",
    "        if cb is None:\n",
    "            raise ValueError(\"APT 'district' are names, but the codebook file was not found.\")\n",
    "        apt_long[\"name_norm\"] = apt_long[\"district\"].map(_norm_name_kor)\n",
    "        apt_long = apt_long.merge(cb[[\"code\",\"name_norm\"]], on=\"name_norm\", how=\"left\")\n",
    "        missing = apt_long[\"code\"].isna().sum()\n",
    "        total = len(apt_long)\n",
    "        print(f\"[INFO] Mapped APT names to codes via codebook. Unmatched rows: {missing}/{total}\")\n",
    "        apt_long[\"시군구\"] = apt_long[\"code\"].astype(str).str.strip()\n",
    "        apt_long.drop(columns=[\"code\"], inplace=True, errors=\"ignore\")\n",
    "\n",
    "    apt_long = apt_long[[\"ym\", \"시군구\", \"APT_price\"]].copy()\n",
    "    apt_long[\"ln_APT_price\"] = np.log(apt_long[\"APT_price\"])\n",
    "    return apt_long\n",
    "\n",
    "# ---- robust parquet writer (handles pyarrow/extension quirks) ----\n",
    "def _sanitize_for_parquet(df: pd.DataFrame) -> pd.DataFrame:\n",
    "    out = df.copy()\n",
    "    out.columns = [str(c) for c in out.columns]\n",
    "    for c in out.columns:\n",
    "        dt = out[c].dtype\n",
    "        if is_extension_array_dtype(dt):\n",
    "            if is_integer_dtype(dt):          # 'Int64', 'Int32', etc.\n",
    "                out[c] = out[c].astype(\"float64\")   # keeps NaN\n",
    "            else:\n",
    "                out[c] = out[c].astype(\"object\")    # string/boolean -> object\n",
    "    return out\n",
    "\n",
    "def write_parquet_robust(df: pd.DataFrame, pq_path: str, csv_path: str):\n",
    "    dfx = _sanitize_for_parquet(df)\n",
    "    try:\n",
    "        dfx.to_parquet(pq_path, index=False, engine=\"fastparquet\")\n",
    "        print(f\"[OK] wrote {pq_path} (engine=fastparquet)\")\n",
    "        return\n",
    "    except Exception as e1:\n",
    "        print(f\"[WARN] fastparquet failed: {e1}\")\n",
    "    try:\n",
    "        dfx.to_parquet(pq_path, index=False, engine=\"pyarrow\")\n",
    "        print(f\"[OK] wrote {pq_path} (engine=pyarrow)\")\n",
    "        return\n",
    "    except Exception as e2:\n",
    "        print(f\"[ERROR] Parquet failed on both engines: {e2}\")\n",
    "        dfx.to_csv(csv_path, index=False, encoding=\"utf-8-sig\")\n",
    "        print(f\"[FALLBACK] wrote CSV instead -> {csv_path}\")\n",
    "\n",
    "# =========================\n",
    "# 1) Read data\n",
    "# =========================\n",
    "df2 = read_df2_from_excel()\n",
    "codebook = load_codebook()\n",
    "apt = read_apt_prices_named_districts(codebook)\n",
    "\n",
    "# Quick audit of keys\n",
    "print(\"[CHECK] df_2 sample keys:\", df2[[\"ym\",\"출발시군구코드\",\"도착시군구코드\"]].head(3).to_dict(\"records\"))\n",
    "print(\"[CHECK] APT sample:\", apt.head(3).to_dict(\"records\"))\n",
    "\n",
    "# =========================\n",
    "# 2) Merge APT prices onto OD rows (destination & origin)\n",
    "# =========================\n",
    "price_d = apt.rename(columns={\"시군구\": \"도착시군구코드\",\n",
    "                              \"APT_price\": \"APT_price_d\",\n",
    "                              \"ln_APT_price\": \"ln_APT_price_d\"})\n",
    "price_o = apt.rename(columns={\"시군구\": \"출발시군구코드\",\n",
    "                              \"APT_price\": \"APT_price_o\",\n",
    "                              \"ln_APT_price\": \"ln_APT_price_o\"})\n",
    "\n",
    "route = df2.merge(price_d[[\"도착시군구코드\",\"ym\",\"APT_price_d\",\"ln_APT_price_d\"]],\n",
    "                  on=[\"도착시군구코드\",\"ym\"], how=\"left\")\n",
    "route = route.merge(price_o[[\"출발시군구코드\",\"ym\",\"APT_price_o\",\"ln_APT_price_o\"]],\n",
    "                    on=[\"출발시군구코드\",\"ym\"], how=\"left\")\n",
    "\n",
    "print(f\"[AUDIT] share with dest price: {route['APT_price_d'].notna().mean():.3f}\")\n",
    "print(f\"[AUDIT] share with orig price: {route['APT_price_o'].notna().mean():.3f}\")\n",
    "\n",
    "# Route-level price contrasts (H1)\n",
    "route[\"Delta_log_price\"] = route[\"ln_APT_price_d\"] - route[\"ln_APT_price_o\"]\n",
    "route[\"Rel_level_price\"] = (route[\"APT_price_d\"] - route[\"APT_price_o\"]) / route[\"APT_price_o\"]\n",
    "\n",
    "# =========================\n",
    "# 3) Destination–month aggregates (H2/H3)\n",
    "# =========================\n",
    "flow_hw_col = \"HW_총이동인구\"\n",
    "if flow_hw_col not in route.columns:\n",
    "    raise KeyError(\"Column 'HW_총이동인구' not found in df_2.xlsx.\")\n",
    "\n",
    "# Inbound subset (o != d)\n",
    "inbound = route.loc[route[\"출발시군구코드\"] != route[\"도착시군구코드\"]].copy()\n",
    "inbound_valid = inbound.dropna(subset=[\"ln_APT_price_o\", flow_hw_col]).copy()\n",
    "\n",
    "# inbound-weighted origin log price per (dest, ym)\n",
    "sum_w  = inbound_valid.groupby([\"도착시군구코드\",\"ym\"])[flow_hw_col].sum()\n",
    "sum_wx = (inbound_valid[\"ln_APT_price_o\"] * inbound_valid[flow_hw_col]) \\\n",
    "            .groupby([inbound_valid[\"도착시군구코드\"], inbound_valid[\"ym\"]]).sum()\n",
    "ln_origin_in = (sum_wx / sum_w).rename(\"ln_origin_in\").reset_index()\n",
    "\n",
    "dest = (\n",
    "    route[[\"도착시군구코드\",\"ym\",\"ln_APT_price_d\"]]\n",
    "    .drop_duplicates([\"도착시군구코드\",\"ym\"])\n",
    "    .merge(ln_origin_in, on=[\"도착시군구코드\",\"ym\"], how=\"left\")\n",
    ")\n",
    "dest[\"GapPrice\"] = dest[\"ln_APT_price_d\"] - dest[\"ln_origin_in\"]\n",
    "\n",
    "# inbound share of HW flow\n",
    "tot_hw = route.groupby([\"도착시군구코드\",\"ym\"])[flow_hw_col].sum().rename(\"HW_flow_total\")\n",
    "inb_hw = inbound.groupby([\"도착시군구코드\",\"ym\"])[flow_hw_col].sum().rename(\"HW_flow_inbound\")\n",
    "dest = dest.merge(tot_hw.reset_index(), on=[\"도착시군구코드\",\"ym\"], how=\"left\") \\\n",
    "           .merge(inb_hw.reset_index(), on=[\"도착시군구코드\",\"ym\"], how=\"left\")\n",
    "dest[\"InboundShare_HW\"] = dest[\"HW_flow_inbound\"] / dest[\"HW_flow_total\"]\n",
    "\n",
    "# Bring inequality stats from DEST already inside df_2 (constant within (d,ym))\n",
    "dest_cols = [c for c in route.columns if c.startswith(\"HW_inbound_\")\n",
    "             or c.startswith(\"HW_within_\")\n",
    "             or c.startswith(\"WH_inbound_\")\n",
    "             or c.startswith(\"WH_within_\")\n",
    "             or c.startswith(\"TOTAL_inbound_\")\n",
    "             or c.startswith(\"TOTAL_within_\")\n",
    "             or c.startswith(\"GAP_inbound_vs_within_\")]\n",
    "dest_unique = route[[\"도착시군구코드\",\"ym\"] + dest_cols].drop_duplicates([\"도착시군구코드\",\"ym\"])\n",
    "dest = dest.merge(dest_unique, on=[\"도착시군구코드\",\"ym\"], how=\"left\")\n",
    "\n",
    "# =========================\n",
    "# 4) SAVE — CSV always; Parquet if possible; optional Excel workbook\n",
    "# =========================\n",
    "# ROUTE (H1)\n",
    "route_cols = [\n",
    "    \"ym\",\"출발시군구코드\",\"출발시군구명\",\"도착시군구코드\",\"도착시군구명\",\n",
    "    \"APT_price_o\",\"APT_price_d\",\"ln_APT_price_o\",\"ln_APT_price_d\",\n",
    "    \"Delta_log_price\",\"Rel_level_price\",\n",
    "    \"HW_총이동인구\",\"WH_총이동인구\",\n",
    "    \"HW_가중평균시간\",\"WH_가중평균시간\",\"TOTAL_통근시간(분)\",\"TOTAL_플로우_보조지표(최소값)\"\n",
    "]\n",
    "route_cols = [c for c in route_cols if c in route.columns]\n",
    "route_out = route[route_cols].copy()\n",
    "\n",
    "# DEST (H2/H3)\n",
    "dest_cols_out = [\"ym\",\"도착시군구코드\",\"ln_APT_price_d\",\"ln_origin_in\",\"GapPrice\",\n",
    "                 \"HW_flow_total\",\"HW_flow_inbound\",\"InboundShare_HW\"] + dest_cols\n",
    "dest_cols_out = [c for c in dest_cols_out if c in dest.columns]\n",
    "dest_out = dest[dest_cols_out].copy()\n",
    "\n",
    "# Always write CSV\n",
    "route_out.to_csv(OUT_ROUTE_CSV, index=False, encoding=\"utf-8-sig\")\n",
    "dest_out.to_csv(OUT_DEST_CSV, index=False, encoding=\"utf-8-sig\")\n",
    "print(f\"[OK] wrote {OUT_ROUTE_CSV}\")\n",
    "print(f\"[OK] wrote {OUT_DEST_CSV}\")\n",
    "\n",
    "# Try Parquet (fastparquet → pyarrow → fallback handled inside)\n",
    "write_parquet_robust(route_out, str(OUT_ROUTE_PARQ), str(OUT_ROUTE_CSV))\n",
    "write_parquet_robust(dest_out,  str(OUT_DEST_PARQ),  str(OUT_DEST_CSV))\n",
    "\n",
    "# Optional: single Excel workbook with two sheets\n",
    "try:\n",
    "    with pd.ExcelWriter(OUT_XLSX) as xlw:\n",
    "        route_out.to_excel(xlw, sheet_name=\"ROUTE_H1\", index=False)\n",
    "        dest_out.to_excel(xlw,  sheet_name=\"DEST_H2H3\", index=False)\n",
    "    print(f\"[OK] wrote {OUT_XLSX}\")\n",
    "except Exception as e:\n",
    "    print(f\"[INFO] Excel export skipped: {e}\")\n",
    "\n",
    "# Quick peek\n",
    "print(\"\\n[HEAD ROUTE]\")\n",
    "print(route_out.head(8))\n",
    "print(\"\\n[HEAD DEST]\")\n",
    "print(dest_out.head(8))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6aeffe8a-e2b1-478e-a39c-14d1482c64f8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
